"""Exploratory analysis of the World Happiness Report 2021 dataset.

Notebook-style script: each ``# %%`` marker starts one cell.  A bare
expression at the end of a cell (``df.head()``, ``summary``, ...) is there
so that a notebook front end displays its value; it has no effect when the
file is run as a plain script.

Expects ``world-happiness-report-2021.csv`` in the working directory.
"""

import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pylab  # ggplot
import seaborn as sns
import statsmodels.formula.api as smf
# NOTE: the original also had ``from scipy.stats import stats``.  That
# namespace is deprecated in SciPy and the name was rebound by the import
# below on the very next statement, so it has been dropped.
from scipy import stats
from statsmodels.stats.multicomp import MultiComparison, pairwise_tukeyhsd  # Tukey

# ``%matplotlib inline`` is IPython-only syntax.  Apply it through the IPython
# API when a kernel is present and skip it when run as a plain script.
try:
    get_ipython().run_line_magic("matplotlib", "inline")  # noqa: F821
except NameError:
    pass

# matplotlib setting
mpl.rcParams["figure.dpi"] = 200
mpl.rcParams["axes.spines.top"] = False
mpl.rcParams["axes.spines.right"] = False

# %%
df = pd.read_csv("world-happiness-report-2021.csv")

# %%
df

# %%
df.head()

# %%
df.info()

# %%
df.describe()

# %%
(
    df.describe()
    .T.style.bar(subset=["mean"], color="#205ff2")
    .background_gradient(subset=["std"], cmap="Reds")
    .background_gradient(subset=["50%"], cmap="coolwarm")
)

# %%
# The table above is the statistical summary of the dataframe.
# The brightest red cell is the standard deviation 6.7620, the highest among
# the standard deviations: Healthy life expectancy has the largest spread.

# %%
plt.figure(figsize=(20, 5))
# Correlation is only defined for numeric columns.  The frame also holds
# string columns ('Country name', 'Regional indicator'), and a plain
# ``df.corr()`` raises on them with pandas >= 2.0.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)

# %%
high_corruption = df[
    ["Country name", "Perceptions of corruption", "Healthy life expectancy"]
].sort_values("Perceptions of corruption", ascending=False)

# %%
fig = px.bar(
    high_corruption[:30],
    x="Country name",
    y="Perceptions of corruption",
    color="Healthy life expectancy",
    title="High corruption countries",
)
fig.show()

# %%
# Among the top 30 countries with the highest perceived corruption,
# Afghanistan, Lesotho, Nigeria and Sierra Leone have the lowest healthy life
# expectancy.

# %%
# Let's see the countries with the least perceived corruption.
# The frame is sorted in descending order, so the tail holds the lowest values.
# NOTE(review): the fixed offset 120 presumably assumes roughly 150 rows -
# confirm against the dataset.
fig = px.bar(
    high_corruption[120:],
    x="Country name",
    y="Perceptions of corruption",
    color="Healthy life expectancy",
    title="Low corruption countries",
)
fig.show()

# %%
life_exp = df[
    ["Country name", "Freedom to make life choices", "Healthy life expectancy"]
].sort_values("Healthy life expectancy", ascending=False)

# %%
# First 20 rows of a descending sort: the HIGHEST healthy life expectancy.
fig = px.bar(
    life_exp[:20],
    x="Country name",
    y="Healthy life expectancy",
    color="Freedom to make life choices",
    title="High life expectancy and freedom for life choices",
)
fig.show()

# %%
# Singapore has the highest healthy life expectancy with a value of 76.953,
# and its freedom to make life choices is also very high (0.927).

# %%
# NOTE(review): the fixed offset 140 presumably assumes roughly 150 rows -
# confirm against the dataset.
fig = px.bar(
    life_exp[140:],
    x="Country name",
    y="Healthy life expectancy",
    color="Freedom to make life choices",
    title="Low life expectancy and freedom for life choices",
)
fig.show()

# %%
# Chad, Lesotho and Nigeria have the lowest healthy life expectancy.

# %%
sns.barplot(y="Regional indicator", x="Freedom to make life choices", data=df)

# %%
fig = px.scatter(
    df,
    x="Regional indicator",
    y="Freedom to make life choices",
    color="Healthy life expectancy",
    size="Logged GDP per capita",
    hover_data=["Social support"],
)
fig.show()

# %%
# Move your cursor around the bubbles to get detailed information.
# For example, hovering over the lowest bubble of South Asia shows:
# Freedom to make life choices = 0.382, Logged GDP per capita = 7.695,
# Social support = 0.463, Healthy life expectancy = 52.493.

# %%
cols = [
    "Logged GDP per capita",
    "Social support",
    "Healthy life expectancy",
    "Freedom to make life choices",
    "Generosity",
    "Perceptions of corruption",
    "Ladder score in Dystopia",
]
sns.pairplot(df[cols], height=3)

# %%
# - Logged GDP per capita has a strong correlation with Healthy life
#   expectancy and Social support, and a somewhat good correlation with
#   Freedom to make life choices.
# - Social support has a strong correlation with Healthy life expectancy and
#   a somewhat good correlation with Freedom to make life choices.
# - Healthy life expectancy has a strong correlation with Logged GDP per
#   capita and Social support, and a somewhat strong correlation with Freedom
#   to make life choices.
# - Freedom to make life choices has a somewhat strong correlation with
#   Logged GDP per capita, Healthy life expectancy, Social support and
#   Generosity.

# %%
fig = px.pie(
    df,
    values="Logged GDP per capita",
    names="Regional indicator",
    title="% of Logged GDP of regions from data",
)
fig.show()

# %%
# The Sub-Saharan Africa region makes up 20.7% of the pie.
#
# Log of real GDP per capita: natural logs have a few great properties for
# our purposes.  Using them means that every step up the y-axis is an
# identical percent change in real GDP per capita.  Going from 7.0 to 7.5,
# for example, is a 65% increase in real GDP per capita.  Going from 7.5 to
# 8.0 is also a 65% increase in real GDP per capita.

# %%
fig = px.density_heatmap(
    df,
    x="Freedom to make life choices",
    y="Perceptions of corruption",
    marginal_x="box",
    marginal_y="violin",
)
fig.show()

# %%
# Freedom to make life choices has a median value of 0.804.
# Perceptions of corruption has a median value of 0.781.
# The square boxes give the counts for each combination of freedom to make
# life choices and perceptions of corruption; again, move the cursor around
# to read the values.

# %%
fig = px.scatter(
    df,
    x="Healthy life expectancy",
    y="Logged GDP per capita",
    color="Regional indicator",
    marginal_x="box",
)
fig.show()

# %%
# A simple scatter plot to understand the correlation between healthy life
# expectancy and logged GDP per capita, along with the regions they represent.
# The pink dots at the bottom left represent Sub-Saharan Africa, with a
# healthy life expectancy of 53.4.

# %%
fig = px.violin(
    df,
    y="Logged GDP per capita",
    x="Generosity",
    color="Regional indicator",
    box=True,  # draw box plot inside the violin
    points="all",  # can be 'outliers', or False
    hover_data=df.columns,
)
fig.show()

# %%
# This may be the most essential plot.
# Move your cursor around the points to get the full details of each one
# (all columns: country name, logged GDP, generosity, etc.).

# %%
sns.set_style("darkgrid")

# %%
# Ladder score:
# The rankings of national happiness are based on a Cantril ladder survey.
# Nationally representative samples of respondents are asked to think of a
# ladder, with the best possible life for them being a 10, and the worst
# possible life being a 0.  They are then asked to rate their own current
# lives on that 0 to 10 scale.

# %%
ladder_score = df[
    ["Ladder score", "Standard error of ladder score", "Country name"]
].sort_values("Ladder score", ascending=False)

# %%
ladder_score[:10]

# %%
# Top 12 and bottom 12 ladder scores.
fig = px.histogram(
    ladder_score.head(12),
    x="Ladder score",
    y="Country name",
    color="Country name",
    title="Top 12 Ladder Scores.",
)
fig.show()

fig = px.histogram(
    ladder_score.tail(12),
    x="Ladder score",
    y="Country name",
    color="Country name",
    title="Bottom 12 Ladder scores",
)
fig.show()

# %%
# Finland has the highest and Afghanistan has the lowest score.

# %%
# Logged GDP per capita
gdp = df[
    ["Logged GDP per capita", "Country name", "Regional indicator"]
].sort_values("Logged GDP per capita", ascending=False)
# Top 10
gdp.head(10)

# %%
# Bottom 10
gdp.tail(10)

# %%
print("Average GDP : {}".format(np.mean(gdp["Logged GDP per capita"])))

# %%
# NOTE(review): x and y are the same column, so every point lies on the
# diagonal; only marker size, colour and hover text carry information.
fig = px.scatter(
    gdp,
    x="Logged GDP per capita",
    y="Logged GDP per capita",
    size="Logged GDP per capita",
    color="Country name",
    hover_data=["Regional indicator"],
    title="GDP",
)
fig.show()

# %%
# Luxembourg has the highest logged GDP per capita at 11.647.

# %%
# Social support
social_support = df[["Social support", "Country name"]].sort_values(
    "Social support", ascending=False
)
social_support.tail(10)

# %%
plt.figure(figsize=[20, 10])

plt.subplot(121)
sns.barplot(data=social_support.head(5), x="Social support", y="Country name")
plt.title("Top 5")

plt.subplot(122)
sns.barplot(data=social_support.tail(5), x="Social support", y="Country name")
plt.title("Bottom 5")

# %%
# Iceland again at top and Afghanistan again at bottom.

# %%
# Healthy life expectancy & Freedom to make life choices
hlef = df[
    [
        "Freedom to make life choices",
        "Healthy life expectancy",
        "Country name",
        "Regional indicator",
    ]
].sort_values("Healthy life expectancy", ascending=False)
hlef.head()

# %%
fig = px.scatter(
    hlef,
    x="Healthy life expectancy",
    y="Freedom to make life choices",
    size="Freedom to make life choices",
    color="Regional indicator",
    hover_name="Country name",
)
fig.show()

# %%
# Sub-Saharan Africa seems to have very low healthy life expectancy.

# %%
# Generosity and Perceptions of corruption
gen_cor = df[
    [
        "Perceptions of corruption",
        "Generosity",
        "Country name",
        "Regional indicator",
        "Freedom to make life choices",
    ]
].sort_values("Perceptions of corruption", ascending=False)
# Top corrupt
gen_cor.head(20)

# %%
# Least corrupt
gen_cor.tail(20)

# %%
fig = px.scatter_3d(
    gen_cor,
    x="Generosity",
    y="Freedom to make life choices",
    z="Perceptions of corruption",
    color="Regional indicator",
    hover_name="Country name",
    height=800,
    width=1000,
)
fig.show()

# %%
# There are a lot of corrupt countries compared with less corrupt ones.
# Central and Eastern Europe seems to be mostly corrupt.
# Most countries seem to fall on the negative side in terms of Generosity.

# %%
# The statistics imports for this section live at the top of the file; the
# blanket warning filter stays here so it takes effect at the same point in
# the flow as in the original notebook.
warnings.filterwarnings("ignore")

# %%
# Descriptive statistics
# Work from one numeric-only view so the rows of ``summary`` and the arrays
# returned by skew/kurtosis are guaranteed to cover the same columns in the
# same order.  (The original called ``df._get_numeric_df()``, which does not
# exist on DataFrame.)
numeric_df = df.select_dtypes(include="number")

summary = numeric_df.describe().T
# Standard error of the mean: std / sqrt(count).
se = summary["std"] / np.sqrt(summary["count"])
summary.insert(3, "se", se)  # place it right after 'std'
summary["Skewness"] = stats.skew(numeric_df, nan_policy="omit")
summary["Kurtosis"] = stats.kurtosis(numeric_df, nan_policy="omit")
summary